In [ ]:
import pandas as pd 
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold,  GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklift.models import SoloModel, TwoModels, ClassTransformation, ClassTransformationReg
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import swifter
from tqdm import tqdm
from statsmodels.stats.weightstats  import CompareMeans
import gc
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)
from sklift.viz import  plot_qini_curve, plot_uplift_curve
pd.options.display.max_columns = None

Object For Preprocessing¶

In [ ]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
class PrepocessingDataSetLetu():
    def __init__(self,  cat_vars = ['lftm_type','SEGMENT_N'
                              #  ,	'SUB_EMAIL'
                              #  , 'Тип карты'	
                                , 'Канал регистрации'
                                , 'NewComer', 'FEDERAL_DATE'] 
                ,  for_norm = ['last_order_days',	'AMOUNT'
                                ,	'ORDERS'
                                ,	'AOV'	
                                ,'LTV_1M'
                                , 'AO_per_month'
                                , 'GET_Bonus'
                                ,	'USE_Bonus'
                                ,	'NON_SPEND_BONUS'
                                ,	'BASE_GET_Bonus'
                                ,	'BASE_USE_Bonus'
                                ,	'Express_GET_Bonus'
                                ,	'Express_USE_Bonus'
                                ,	'Camp_GET_Bonus'
                                ,	'REG_GET_Bonus'	
                                , 'Camp_BASE_GET_Bonus'
                                ,	'REG_BASE_GET_Bonus'
                                , 'AVG_USE_BONUS'
                                ,	'DISC_AMOUNT'
                                ,	'DISC_ORDERS'
                                , 'BASE_AVG_USE_BONUS'
                                ,	'BASE_DISC_AMOUNT'
                                ,	'BASE_DISC_ORDERS'
                                , 'Express_AVG_USE_BONUS',	'Express_DISC_AMOUNT',	'Express_DISC_ORDERS'
                                ,'min_days_between', 'max_days_between', 'avg_days_between', 'month']
                , age = 'AGE'
               
            ):
            self.cat_vars = cat_vars
            self.for_norm = for_norm
            self.age = age
            self.scaler = None#StandardScaler()
            self.ohe = None#OneHotEncoder()
            self.ohe_solomodel = None#OneHotEncoder()
            self.med = None


    def remove_colls(self, colls):
        for c in colls:
            if c in self.for_norm:
                self.for_norm.remove(c)
            if c in self.cat_vars:
                self.cat_vars.remove(c)

    def fit(self, df):
        Y_var = df[['target','treat']]
        X_var = df.drop(['target','treat'], axis = 1)
        self.ohe = OneHotEncoder()
        self.scaler = StandardScaler()
        self.ohe_solomodel = OneHotEncoder()
        self.ohe.fit(X_var[self.cat_vars].values)
        self.med = X_var[self.age].median()
        self.scaler.fit(X_var[self.for_norm].values)
        self.ohe_solomodel.fit(Y_var.treat.values.reshape(-1, 1))
        del X_var, Y_var


    def preprocessing_data(self, df):
                        Y_var = df[['target','treat']]
                        X_var = df.drop(['target','treat'], axis = 1)
                        dataset = pd.DataFrame()
                        


                        ## Categorical to Vec
                        if(self.ohe is None):
                            self.ohe = OneHotEncoder()
                            res = self.ohe.fit_transform(X_var[self.cat_vars].values).toarray()
                        else:
                            res = self.ohe.transform(X_var[self.cat_vars].values).toarray()
                        
                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)
                        X_var = X_var.drop(self.cat_vars, axis =1)                    
                       #dataset.head()

                        ## Numeric preprocessing
                        if(self.med is None):
                            self.med = X_var[self.age].median()

                        res =  X_var[self.age].apply(lambda x: x if((x >= 16) and (x <= 90)) else self.med).fillna(self.med)
                        res = (res / 100).values                                            
                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)                                            
                        X_var = X_var.drop([self.age], axis =1).fillna(0)                  

                        if(self.scaler is None):
                            self.scaler = StandardScaler()
                            res = self.scaler.fit_transform(X_var[self.for_norm].values)
                        else:
                            res = self.scaler.transform(X_var[self.for_norm].values)

                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)
                        X_var = X_var.drop(self.for_norm, axis =1)      
                        dataset = pd.concat([dataset
                                        , pd.DataFrame(X_var.values)] , axis = 1)
                        


                        del X_var   
                        gc.collect()
                        cols = ['var_'+str(i) for i in range(dataset.shape[1])]
                        dataset.columns = cols
                        dataset.index = df.index
                        return dataset, Y_var  


    
    def preprocessing_data_solomodel(self, df):
                        Y_var = df[['target','treat']]
                        X_var = df.drop(['target','treat'], axis = 1)
                        dataset = pd.DataFrame()
                        
                        ## Categorical to Vec
                        if(self.ohe is None):
                            self.ohe = OneHotEncoder()
                            res = self.ohe.fit_transform(X_var[self.cat_vars].values).toarray()
                        else:
                            res = self.ohe.transform(X_var[self.cat_vars].values).toarray()
                        
                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)
                        X_var = X_var.drop(self.cat_vars, axis =1)                    
                       #dataset.head()

                        ## Numeric preprocessing
                        if(self.med is None):
                            self.med = X_var[self.age].median()

                        res =  X_var[self.age].apply(lambda x: x if((x >= 16) and (x <= 90)) else self.med).fillna(self.med)
                        res = (res / 100).values    
                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)                                            
                        X_var = X_var.drop([self.age], axis =1).fillna(0)                  

                        if(self.scaler is None):
                            self.scaler = StandardScaler()
                            res = self.scaler.fit_transform(X_var[self.for_norm].values)
                        else:
                            res = self.scaler.transform(X_var[self.for_norm].values)

                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)
                        X_var = X_var.drop(self.for_norm, axis =1)      
                        dataset = pd.concat([dataset
                                        , pd.DataFrame(X_var.values)] , axis = 1)
                        del X_var   
                        gc.collect()


                        if(self.ohe_solomodel is None):
                            self.ohe_solomodel = OneHotEncoder()
                            res = self.ohe_solomodel.fit_transform(Y_var.treat.values.reshape(-1, 1)).toarray()
                        else:
                            res = self.ohe_solomodel.transform(Y_var.treat.values.reshape(-1, 1)).toarray()
                        
                        dataset = pd.concat([dataset, pd.DataFrame(res)], axis = 1)

                        cols = ['var_'+str(i) for i in range(dataset.shape[1])]
                        dataset.columns = cols
                        dataset.index = df.index
                        return dataset, Y_var
In [ ]:
def get_metrics(target, treat, uplift, k):
    return uplift_at_k(target, uplift, treat, strategy='overall', k=k) , uplift_at_k(target, uplift, treat, strategy='by_group', k=k) , qini_auc_score(target, uplift, treat), uplift_auc_score(target, uplift, treat)


def get_report(target, treat, uplift, plot_path = '', k =0.3):
    """
    Return:
        * uplift_at_k_overall
        * plift_at_k_group
        * qini_score
        * uplift_score

        Print qini, uplift plots
    """
    uplift_overall, uplift_group, qini_score, uplift_score = get_metrics(target, treat, uplift, k)                   
    print(f'UpLift at 30%: overall - {round(uplift_overall, 6)}, grouped - {round(uplift_group, 6)}.\nQini AUC Score: {round(qini_score, 6)}; UpLift AUC Score: {round(uplift_score, 6)}')

    fig, ax = plt.subplots(1, 2, figsize = (30, 10))
    ax[0].set_title('Qini Curve')
    ax[1].set_title('UpLift Curve')
    plot_qini_curve(target , uplift, treatment=treat, ax = ax[0], perfect=False)
    gc.collect()
    plot_uplift_curve(target , uplift, treatment=treat, ax = ax[1], perfect=False)
    gc.collect()
    gc.collect()
    gc.collect()
    plt.savefig(f"Plots/Base model Split.png")
    return uplift_overall, uplift_group, qini_score, uplift_score
In [ ]:
def code_seasons(mont):
          #dt = pd.to_datetime(dt
      #     with open('log_months.txt', 'a') as f:
      #            f.write(str(mont))
      #            f.write('\n\n')
          if pd.isna(mont):
                 return pd.NA
          if (mont > 12) or (mont < 1):
                 return pd.NA
          if mont in (12, 2, 3):
                return 4
          if mont in (11, 10, 1):
                return 3
          if mont in (4, 5, 6):
                return 1
          if mont in (7, 8, 9):
                return 2
          
In [ ]:
 

Compare means, vars¶

In [ ]:
df= pd.read_csv(r'uplift_dataset.zip'
                    , sep = ';'
                    , encoding = 'ANSI'
                #     , dtype = {'NPL':str
                #             , 'SEGMENT_N':str}
                , index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte'], axis = 1)#.set_index('NPL')
df_subm = pd.read_csv(r'uplift_subm_dataset.zip'
                    , sep = ';'
                    , encoding = 'ANSI'
                #     , dtype = {'NPL':str
                #             , 'SEGMENT_N':str}
                , index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte'], axis = 1)#.set_index('NPL')



df.index.name = None
df_subm.index.name = None
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[6], line 1
----> 1 df= pd.read_csv(r'uplift_dataset.zip'
      2                     , sep = ';'
      3                     , encoding = 'ANSI'
      4                 #     , dtype = {'NPL':str
      5                 #             , 'SEGMENT_N':str}
      6                 , index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte'], axis = 1)#.set_index('NPL')
      7 df_subm = pd.read_csv(r'uplift_subm_dataset.zip'
      8                     , sep = ';'
      9                     , encoding = 'ANSI'
     10                 #     , dtype = {'NPL':str
     11                 #             , 'SEGMENT_N':str}
     12                 , index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte'], axis = 1)#.set_index('NPL')
     16 df.index.name = None

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\util\_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\parsers\readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\parsers\readers.py:605, in _read(filepath_or_buffer, kwds)
    602 _validate_names(kwds.get("names", None))
    604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
    607 if chunksize or iterator:
    608     return parser

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\parsers\readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   1439     self.options["has_index_names"] = kwds["has_index_names"]
   1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\parsers\readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   1733     if "b" not in mode:
   1734         mode += "b"
-> 1735 self.handles = get_handle(
   1736     f,
   1737     mode,
   1738     encoding=self.options.get("encoding", None),
   1739     compression=self.options.get("compression", None),
   1740     memory_map=self.options.get("memory_map", False),
   1741     is_text=is_text,
   1742     errors=self.options.get("encoding_errors", "strict"),
   1743     storage_options=self.options.get("storage_options", None),
   1744 )
   1745 assert self.handles is not None
   1746 f = self.handles.handle

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\common.py:779, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    774 # ZIP Compression
    775 elif compression == "zip":
    776     # error: Argument 1 to "_BytesZipFile" has incompatible type
    777     # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
    778     # ReadBuffer[bytes], WriteBuffer[bytes]]"
--> 779     handle = _BytesZipFile(
    780         handle, ioargs.mode, **compression_args  # type: ignore[arg-type]
    781     )
    782     if handle.buffer.mode == "r":
    783         handles.append(handle)

File c:\Users\Kirill\.conda\envs\automl\lib\site-packages\pandas\io\common.py:1022, in _BytesZipFile.__init__(self, file, mode, archive_name, **kwargs)
   1018 kwargs.setdefault("compression", zipfile.ZIP_DEFLATED)
   1019 # error: Argument 1 to "ZipFile" has incompatible type "Union[
   1020 # Union[str, PathLike[str]], ReadBuffer[bytes], WriteBuffer[bytes]]";
   1021 # expected "Union[Union[str, PathLike[str]], IO[bytes]]"
-> 1022 self.buffer = zipfile.ZipFile(file, mode, **kwargs)

File c:\Users\Kirill\.conda\envs\automl\lib\zipfile.py:1251, in ZipFile.__init__(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)
   1249 while True:
   1250     try:
-> 1251         self.fp = io.open(file, filemode)
   1252     except OSError:
   1253         if filemode in modeDict:

FileNotFoundError: [Errno 2] No such file or directory: 'uplift_dataset.zip'
In [ ]:
df['Тип карты'].value_counts()
Out[ ]:
Как карточное платежное средство    3141649
Без возможности оплаты              1794950
Name: Тип карты, dtype: int64
In [ ]:
df['SUB_EMAIL'] = df['SUB_EMAIL'].swifter.apply(lambda x: 1 if x == 'Да' else 0)
df['lftm_type'] = df['lftm_type'].swifter.apply(lambda x: 1 if x =='NewComer' else 0)
df['Тип карты'] = df['Тип карты'].swifter.apply(lambda x: 1 if x == 'Как карточное платежное средство' else 0)
df['Канал регистрации'] = df['Канал регистрации'].swifter.apply(lambda x: 1 if x =='Розница' else 0)

df_subm['SUB_EMAIL'] = df_subm['SUB_EMAIL'].swifter.apply(lambda x: 1 if x == 'Да' else 0)
df_subm['lftm_type'] = df_subm['lftm_type'].swifter.apply(lambda x: 1 if x =='NewComer' else 0)
df_subm['Тип карты'] = df_subm['Тип карты'].swifter.apply(lambda x: 1 if x == 'Как карточное платежное средство' else 0)
df_subm['Канал регистрации'] = df_subm['Канал регистрации'].swifter.apply(lambda x: 1 if x =='Розница' else 0)
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/4936599 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/4936599 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/4936599 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/4936599 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/672331 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/672331 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/672331 [00:00<?, ?it/s]
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:87: UserWarning: This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`.
  warnings.warn(
Pandas Apply:   0%|          | 0/672331 [00:00<?, ?it/s]
In [ ]:
display(df.describe())
display(df_subm.describe())
SEGMENT_N SUB_EMAIL target treat lftm_type NewComer Тип карты Канал регистрации AGE AMOUNT ORDERS AOV last_order_days LTV_1M AO_per_month GET_Bonus USE_Bonus NON_SPEND_BONUS BASE_GET_Bonus BASE_USE_Bonus Express_GET_Bonus Express_USE_Bonus Camp_GET_Bonus REG_GET_Bonus Camp_BASE_GET_Bonus REG_BASE_GET_Bonus AVG_DIS AVG_USE_BONUS DISC_AMOUNT DISC_ORDERS BASE_AVG_DIS BASE_AVG_USE_BONUS BASE_DISC_AMOUNT BASE_DISC_ORDERS Express_AVG_DIS Express_AVG_USE_BONUS Express_DISC_AMOUNT Express_DISC_ORDERS min_days_between max_days_between avg_days_between
count 4.936599e+06 4.936599e+06 4.936599e+06 4.936599e+06 4.936599e+06 4.936599e+06 4.936599e+06 4.936599e+06 2.361936e+06 3.965534e+06 4.924956e+06 3.965534e+06 4.010567e+06 3.965534e+06 4.924956e+06 4.924931e+06 1.752073e+06 1.752048e+06 3.972821e+06 490812.000000 4.909328e+06 1.683254e+06 67532.000000 3.965533e+06 67532.000000 3.965533e+06 1.752073e+06 1.752073e+06 1.688375e+06 1.752073e+06 490812.000000 490812.000000 459433.000000 490812.000000 1.683254e+06 1.683254e+06 1.603112e+06 1.683254e+06 1.369972e+06 1.369972e+06 1.369972e+06
mean 2.102686e+00 5.416395e-01 7.076350e-02 9.045957e-01 9.942412e-01 3.618461e-01 6.363995e-01 5.491961e-01 3.668345e+01 3.870770e+03 1.338891e+00 2.568206e+03 4.640763e+01 9.676926e+02 3.347227e-01 6.673301e+02 4.581450e+02 3.300309e+02 5.077126e+01 93.949150 6.283649e+02 4.494819e+02 338.617826 3.993899e+01 338.617826 3.993899e+01 2.778764e-01 3.727992e+02 2.987464e+03 1.295198e+00 0.106659 68.145312 3059.505649 1.263364 2.986837e-01 3.996113e+02 2.684032e+03 1.152093e+00 -2.917466e+01 -1.769976e+01 -2.259933e+01
std 1.802472e+00 4.982632e-01 2.564294e-01 2.937727e-01 7.566809e-02 4.805346e-01 4.810356e-01 4.975739e-01 1.384153e+01 5.685045e+03 1.408313e+00 3.125496e+03 3.061397e+01 1.421261e+03 3.520784e-01 3.555825e+02 3.079639e+02 2.990090e+02 1.252251e+02 266.839348 3.321920e+02 2.483220e+02 439.071044 5.743451e+01 439.071044 5.743451e+01 2.211580e-01 1.689051e+02 3.979060e+03 7.359323e-01 0.234130 130.084268 4470.432253 0.695822 2.297041e-01 1.523434e+02 3.271385e+03 5.076997e-01 2.515927e+01 2.342498e+01 2.232014e+01
min -1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -1.320000e+02 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 -5.000000e+02 1.000000e+00 -1.919100e+04 -4.520000e+02 0.160000 -5.000000e+02 1.000000e+00 0.000000 0.000000e+00 0.000000 0.000000e+00 4.800000e-05 1.000000e+00 0.000000e+00 0.000000e+00 0.000021 0.160000 0.000000 0.000000 1.740000e-04 1.000000e+00 0.000000e+00 0.000000e+00 -1.200000e+02 -1.200000e+02 -1.200000e+02
25% 1.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.800000e+01 7.970000e+02 1.000000e+00 6.040000e+02 2.000000e+01 1.992500e+02 2.500000e-01 3.890000e+02 3.000000e+02 5.400000e+01 9.000000e+00 11.000000 3.000000e+02 3.000000e+02 144.000000 9.000000e+00 144.000000 9.000000e+00 9.525600e-02 3.000000e+02 6.132500e+02 1.000000e+00 0.008535 10.500000 673.850000 1.000000 1.088610e-01 3.000000e+02 5.830000e+02 1.000000e+00 -4.600000e+01 -2.800000e+01 -3.200000e+01
50% 1.000000e+00 1.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 3.600000e+01 2.118700e+03 1.000000e+00 1.439400e+03 4.200000e+01 5.296750e+02 2.500000e-01 5.430000e+02 5.000000e+02 3.090000e+02 2.300000e+01 32.000000 5.000000e+02 5.000000e+02 218.000000 2.300000e+01 218.000000 2.300000e+01 2.194420e-01 3.000000e+02 1.650000e+03 1.000000e+00 0.018980 28.000000 1576.000000 1.000000 2.461085e-01 3.980000e+02 1.516000e+03 1.000000e+00 -2.500000e+01 -7.000000e+00 -1.700000e+01
75% 4.000000e+00 1.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 4.400000e+01 5.071000e+03 2.000000e+00 3.496000e+03 7.000000e+01 1.267750e+03 5.000000e-01 8.460000e+02 5.000000e+02 5.120000e+02 5.400000e+01 78.000000 8.000000e+02 5.000000e+02 470.000000 5.200000e+01 470.000000 5.200000e+01 4.497430e-01 5.000000e+02 4.044000e+03 1.000000e+00 0.060657 63.000000 3888.750000 1.000000 4.716980e-01 5.000000e+02 3.720400e+03 1.000000e+00 -6.000000e+00 0.000000e+00 -5.000000e+00
max 8.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.220000e+02 1.164658e+06 3.440000e+02 1.817081e+05 1.230000e+02 2.911646e+05 8.600000e+01 7.295800e+04 9.214900e+04 1.375900e+04 7.165800e+04 91649.000000 3.811000e+04 3.711000e+04 51312.000000 1.173200e+04 51312.000000 1.173200e+04 1.000000e+00 1.023878e+04 3.074038e+05 5.500000e+01 1.000000 11579.500000 307403.800000 36.000000 1.000000e+00 4.201000e+03 2.140524e+05 5.500000e+01 0.000000e+00 0.000000e+00 0.000000e+00
SEGMENT_N SUB_EMAIL target treat lftm_type NewComer Тип карты Канал регистрации AGE AMOUNT ORDERS AOV last_order_days LTV_1M AO_per_month GET_Bonus USE_Bonus NON_SPEND_BONUS BASE_GET_Bonus BASE_USE_Bonus Express_GET_Bonus Express_USE_Bonus Camp_GET_Bonus REG_GET_Bonus Camp_BASE_GET_Bonus REG_BASE_GET_Bonus AVG_DIS AVG_USE_BONUS DISC_AMOUNT DISC_ORDERS BASE_AVG_DIS BASE_AVG_USE_BONUS BASE_DISC_AMOUNT BASE_DISC_ORDERS Express_AVG_DIS Express_AVG_USE_BONUS Express_DISC_AMOUNT Express_DISC_ORDERS min_days_between max_days_between avg_days_between
count 672331.000000 672331.000000 672331.000000 672331.000000 672331.000000 672331.000000 672331.000000 672331.000000 331092.000000 483658.000000 600169.000000 483658.000000 488789.000000 483658.000000 600169.000000 599761.000000 257146.000000 256738.000000 485352.000000 111350.000000 514547.000000 225433.000000 13077.000000 483658.000000 13077.000000 483658.000000 257146.000000 257146.000000 248551.000000 257146.000000 111350.000000 111350.000000 100669.000000 111350.000000 225433.000000 225433.000000 217959.000000 225433.000000 206564.000000 206564.000000 206564.000000
mean 2.106132 0.588002 0.075958 0.906503 0.805007 0.398873 0.686702 0.483118 36.470356 4152.327901 1.506834 2481.633738 59.143682 1038.081975 0.376708 603.659295 419.713824 256.108941 55.470268 82.047984 651.308232 438.230818 312.368892 42.883207 312.368892 42.883207 0.248066 311.461608 3118.272710 1.412641 0.122682 58.132009 3149.681807 1.305819 0.278010 358.854413 2697.024404 1.244942 -35.312320 -20.528045 -26.690241
std 1.800421 0.492195 0.264931 0.291128 0.396196 0.489667 0.463835 0.499715 13.660333 5677.174007 1.533454 2949.195685 28.047024 1419.293502 0.383364 423.810788 335.313893 401.664944 117.255265 172.090627 375.943722 295.111924 305.908251 57.372828 305.908251 57.372828 0.217362 178.015652 4062.963324 0.865727 0.259087 106.297405 4296.324696 0.715556 0.215228 153.578484 3234.438491 0.656707 26.065131 25.571211 23.511745
min -1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -132.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -3486.000000 0.040000 -6986.000000 -480.000000 0.040000 -3500.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000091 0.040000 0.000000 0.000000 0.000066 0.040000 0.000000 0.000000 0.000125 1.000000 0.000000 0.000000 -121.000000 -121.000000 -121.000000
25% 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 28.000000 884.800000 1.000000 619.567500 36.000000 221.200000 0.250000 305.000000 300.000000 15.000000 10.000000 13.000000 300.000000 300.000000 149.000000 10.000000 149.000000 10.000000 0.074374 210.000000 661.000000 1.000000 0.009734 11.000000 696.750000 1.000000 0.098220 300.000000 599.000000 1.000000 -54.000000 -33.000000 -39.000000
50% 1.000000 1.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 36.000000 2380.750000 1.000000 1461.850000 59.000000 595.187500 0.250000 540.000000 300.000000 306.000000 26.000000 34.000000 600.000000 300.000000 226.000000 25.000000 226.000000 25.000000 0.179748 300.000000 1769.400000 1.000000 0.021012 28.000000 1774.000000 1.000000 0.225580 300.000000 1552.000000 1.000000 -33.000000 -9.000000 -21.000000
75% 4.000000 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 44.000000 5423.500000 2.000000 3359.400000 79.000000 1355.875000 0.500000 879.000000 500.000000 507.000000 57.000000 76.000000 900.000000 500.000000 396.000000 56.000000 396.000000 56.000000 0.401687 397.000000 4211.000000 2.000000 0.065359 59.000000 4033.000000 1.000000 0.438052 500.000000 3760.000000 1.000000 -13.000000 0.000000 -8.000000
max 8.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 122.000000 353298.000000 75.000000 152911.550000 122.000000 88324.500000 18.750000 18023.000000 16523.000000 5292.000000 9050.000000 8537.000000 18023.000000 16523.000000 6380.000000 3537.000000 6380.000000 3537.000000 1.000000 7000.000000 201940.000000 29.000000 1.000000 6940.000000 134681.700000 21.000000 1.000000 3500.000000 125716.000000 25.000000 0.000000 0.000000 0.000000
In [ ]:
def ttest_means(x_mean = .0, y_mean = .0, x_std = 1.0, y_std = 1.0, n1 = 2, n2 = 1):
    from scipy.stats import t
    S = np.sqrt(((n1 - 1) * x_std**2 +  (n2 - 1) * y_std**2) - 2)
    T = (x_mean - y_mean) / (S * np.sqrt(1 / n1 + 1 / n2))
    rv = t(df = n1 + n2 - 2)
    return rv.sf(np.abs(T)) * 2, T


def chi_vars(x_var = 1.0, y_var = 1.0, n1 = 2, n2 = 1):
    from scipy.stats import f
    F = x_var/y_var #calculate F test statistic 
    dfn = n1-1 #define degrees of freedom numerator 
    dfd = n2-1 #define degrees of freedom denominator 
    p = 1- f.cdf(F, dfn, dfd) #find p-value of F test statistic 
    return p , F   
In [ ]:
colls = df.columns.tolist()
res = {}
stats = {}
for coll in colls:
    p_m, T = ttest_means(df[coll].mean()
                        , df_subm[coll].mean()
                        , df[coll].std()
                        , df_subm[coll].std()
                        , df[coll].dropna().shape[0]
                        , df_subm[coll].dropna().shape[0])
    p_v, F = chi_vars(df[coll].std(ddof = 1)
                        , df_subm[coll].std(ddof = 1)
                        , df[coll].dropna().shape[0]
                        , df_subm[coll].dropna().shape[0])
    res[coll] = {'p_means': round(p_m, 4)
                    , 'p_vars': round(p_v, 4)}
    stats[coll] = (T, F, round(df[coll].mean(), 4)
                        , round(df_subm[coll].mean(), 4)
                        , round(df[coll].std(), 4)
                        , round(df_subm[coll].std(), 4)
                        , round(df[coll].dropna().shape[0] + df_subm[coll].dropna().shape[0] - 2 , 4)
                        , round(df[coll].std(ddof = 1), 4)
                        , round(df_subm[coll].std(ddof = 1), 4))
In [ ]:
# res
In [ ]:
# stats
In [ ]:
# res_sort = sorted(res.items(), key = lambda x: x[1])
# res_sort
In [ ]:
colls_to_drop = [k for k,v in res.items() if (v['p_means'] <= 0.05) or (v['p_vars'] <= 0.05)
            ]


if 'treat' in  colls_to_drop:
    colls_to_drop.remove('treat')

if 'target' in  colls_to_drop:
    colls_to_drop.remove('target')
colls_to_drop
Out[ ]:
['SUB_EMAIL',
 'Тип карты',
 'AGE',
 'AOV',
 'last_order_days',
 'BASE_GET_Bonus',
 'BASE_USE_Bonus',
 'Camp_GET_Bonus',
 'Camp_BASE_GET_Bonus',
 'AVG_DIS',
 'BASE_AVG_USE_BONUS',
 'BASE_DISC_AMOUNT',
 'Express_AVG_DIS',
 'Express_DISC_AMOUNT']
In [ ]:
colls_to_drop = []
In [ ]:
del df
del df_subm
gc.collect()
Out[ ]:
131

pass¶

In [ ]:
 
In [ ]:
colls_to_drop = []

Load Data¶

In [ ]:
df= pd.read_csv(r'uplift_alls_dataset.zip'
                    , sep = ';'
                    , encoding = 'ANSI'
                    , dtype = {'NPL':str
                            , 'SEGMENT_N':str}).drop(['NPL', 'Group', 'Send_DAte', 'SUB_EMAIL', 'Тип карты'] + colls_to_drop, axis = 1)#.set_index('NPL')
df['month'] = pd.to_datetime(df.FEDERAL_DATE).dt.month
display(df.head())
C:\Users\Kirill\AppData\Local\Temp\ipykernel_6800\3037996996.py:1: DtypeWarning: Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.
  df= pd.read_csv(r'uplift_alls_dataset.zip'
FEDERAL_DATE SEGMENT_N target treat lftm_type NewComer Канал регистрации AGE AMOUNT ORDERS AOV last_order_days LTV_1M AO_per_month GET_Bonus USE_Bonus NON_SPEND_BONUS BASE_GET_Bonus BASE_USE_Bonus Express_GET_Bonus Express_USE_Bonus Camp_GET_Bonus REG_GET_Bonus Camp_BASE_GET_Bonus REG_BASE_GET_Bonus AVG_DIS AVG_USE_BONUS DISC_AMOUNT DISC_ORDERS BASE_AVG_DIS BASE_AVG_USE_BONUS BASE_DISC_AMOUNT BASE_DISC_ORDERS Express_AVG_DIS Express_AVG_USE_BONUS Express_DISC_AMOUNT Express_DISC_ORDERS min_days_between max_days_between avg_days_between month
0 2022-11-01 1 1 1 NewComer 0 Ecomm 33.0 NaN 0.0 NaN NaN NaN 0.00 1000.0 NaN NaN NaN NaN 1000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
1 2022-11-01 1 0 1 NewComer 0 Розница 61.0 5424.50 1.0 5424.500000 43.0 1356.1250 0.25 355.0 NaN NaN 55.0 NaN 300.0 NaN NaN 55.0 NaN 55.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
2 2022-11-01 3 0 0 NewComer 1 Розница NaN 517.00 3.0 172.333333 52.0 129.2500 0.75 507.0 463.0 44.0 7.0 NaN 500.0 463.0 NaN 7.0 NaN 7.0 0.498922 463.0 465.00 1.0 NaN NaN NaN NaN 0.498922 463.0 465.00 1.0 -13.0 0.0 -6.0 11
3 2022-11-01 -1 0 1 NewComer 1 Ecomm NaN NaN 0.0 NaN NaN NaN 0.00 800.0 NaN NaN NaN NaN 800.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
4 2022-11-01 1 1 1 NewComer 0 Ecomm 57.0 2143.85 1.0 2143.850000 18.0 535.9625 0.25 323.0 300.0 23.0 23.0 NaN 300.0 300.0 NaN 23.0 NaN 23.0 0.122757 300.0 2143.85 1.0 NaN NaN NaN NaN 0.122757 300.0 2143.85 1.0 NaN NaN NaN 11
In [ ]:
display(df.head())
FEDERAL_DATE SEGMENT_N target treat lftm_type NewComer Канал регистрации AGE AMOUNT ORDERS AOV last_order_days LTV_1M AO_per_month GET_Bonus USE_Bonus NON_SPEND_BONUS BASE_GET_Bonus BASE_USE_Bonus Express_GET_Bonus Express_USE_Bonus Camp_GET_Bonus REG_GET_Bonus Camp_BASE_GET_Bonus REG_BASE_GET_Bonus AVG_DIS AVG_USE_BONUS DISC_AMOUNT DISC_ORDERS BASE_AVG_DIS BASE_AVG_USE_BONUS BASE_DISC_AMOUNT BASE_DISC_ORDERS Express_AVG_DIS Express_AVG_USE_BONUS Express_DISC_AMOUNT Express_DISC_ORDERS min_days_between max_days_between avg_days_between month
0 2022-11-01 1 1 1 NewComer 0 Ecomm 33.0 NaN 0.0 NaN NaN NaN 0.00 1000.0 NaN NaN NaN NaN 1000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
1 2022-11-01 1 0 1 NewComer 0 Розница 61.0 5424.50 1.0 5424.500000 43.0 1356.1250 0.25 355.0 NaN NaN 55.0 NaN 300.0 NaN NaN 55.0 NaN 55.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
2 2022-11-01 3 0 0 NewComer 1 Розница NaN 517.00 3.0 172.333333 52.0 129.2500 0.75 507.0 463.0 44.0 7.0 NaN 500.0 463.0 NaN 7.0 NaN 7.0 0.498922 463.0 465.00 1.0 NaN NaN NaN NaN 0.498922 463.0 465.00 1.0 -13.0 0.0 -6.0 11
3 2022-11-01 -1 0 1 NewComer 1 Ecomm NaN NaN 0.0 NaN NaN NaN 0.00 800.0 NaN NaN NaN NaN 800.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
4 2022-11-01 1 1 1 NewComer 0 Ecomm 57.0 2143.85 1.0 2143.850000 18.0 535.9625 0.25 323.0 300.0 23.0 23.0 NaN 300.0 300.0 NaN 23.0 NaN 23.0 0.122757 300.0 2143.85 1.0 NaN NaN NaN NaN 0.122757 300.0 2143.85 1.0 NaN NaN NaN 11
In [ ]:
df.FEDERAL_DATE = df.month.swifter.apply(code_seasons, axis = 1).astype(int)
display(df.head())
c:\Users\Kirill\.conda\envs\automl\lib\site-packages\swifter\swifter.py:305: UserWarning: Axis keyword not necessary because applying on a Series.
  warnings.warn("Axis keyword not necessary because applying on a Series.")
Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]
FEDERAL_DATE SEGMENT_N target treat lftm_type NewComer Канал регистрации AGE AMOUNT ORDERS AOV last_order_days LTV_1M AO_per_month GET_Bonus USE_Bonus NON_SPEND_BONUS BASE_GET_Bonus BASE_USE_Bonus Express_GET_Bonus Express_USE_Bonus Camp_GET_Bonus REG_GET_Bonus Camp_BASE_GET_Bonus REG_BASE_GET_Bonus AVG_DIS AVG_USE_BONUS DISC_AMOUNT DISC_ORDERS BASE_AVG_DIS BASE_AVG_USE_BONUS BASE_DISC_AMOUNT BASE_DISC_ORDERS Express_AVG_DIS Express_AVG_USE_BONUS Express_DISC_AMOUNT Express_DISC_ORDERS min_days_between max_days_between avg_days_between month
0 3 1 1 1 NewComer 0 Ecomm 33.0 NaN 0.0 NaN NaN NaN 0.00 1000.0 NaN NaN NaN NaN 1000.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
1 3 1 0 1 NewComer 0 Розница 61.0 5424.50 1.0 5424.500000 43.0 1356.1250 0.25 355.0 NaN NaN 55.0 NaN 300.0 NaN NaN 55.0 NaN 55.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
2 3 3 0 0 NewComer 1 Розница NaN 517.00 3.0 172.333333 52.0 129.2500 0.75 507.0 463.0 44.0 7.0 NaN 500.0 463.0 NaN 7.0 NaN 7.0 0.498922 463.0 465.00 1.0 NaN NaN NaN NaN 0.498922 463.0 465.00 1.0 -13.0 0.0 -6.0 11
3 3 -1 0 1 NewComer 1 Ecomm NaN NaN 0.0 NaN NaN NaN 0.00 800.0 NaN NaN NaN NaN 800.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 11
4 3 1 1 1 NewComer 0 Ecomm 57.0 2143.85 1.0 2143.850000 18.0 535.9625 0.25 323.0 300.0 23.0 23.0 NaN 300.0 300.0 NaN 23.0 NaN 23.0 0.122757 300.0 2143.85 1.0 NaN NaN NaN NaN 0.122757 300.0 2143.85 1.0 NaN NaN NaN 11

Базовая модель¶

In [ ]:
proc_data = PrepocessingDataSetLetu()
proc_data.remove_colls(colls_to_drop)
X_data, Y_data = proc_data.preprocessing_data(df)
display(X_data.head()) 
display(Y_data.head()) 
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
In [ ]:
treat = Y_data.treat.values
y = Y_data.target.values

from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt


y_pred = np.random.uniform(-1, 1, len(treat)).tolist()
gc.collect()
uplift_overall, uplift_group, qini_score, uplift_score = get_report(y, treat, y_pred, 'Plots/Base model Split.png')
UpLift at 30%: overall - 0.006719, grouped - 0.006729.
Qini AUC Score: 0.000516; UpLift AUC Score: 0.000329
In [ ]:
#del df
del X_data
del Y_data
del treat
del y
gc.collect()
Out[ ]:
7862

Метод одной модели¶

In [ ]:
# df= pd.read_csv(r'uplift_alls_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop([ 'Group', 'Send_DAte', 'SUB_EMAIL', 'Тип карты'] + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
# df.FEDERAL_DATE = pd.to_datetime(df.FEDERAL_DATE).swifter.apply(code_seasons)
X_data, Y_data = proc_data.preprocessing_data(df)
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
Out[ ]:
0
In [ ]:
#Default data
skf = StratifiedKFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0


for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      catboost_clf = CatBoostClassifier(verbose = 0)
      clf = SoloModel(estimator=catboost_clf)
      clf.fit(X_train_, y_train_, treatment = treat[train_index])
      y_pred = clf.predict(X_test_)
      del clf
      del X_train_
      del y_train_
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(y_test_, treat[test_index], y_pred, f"Plots/One model Split {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics[ 'qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1
    
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.010261, grouped - 0.010864.
Qini AUC Score: 0.011339; UpLift AUC Score: 0.007151
UpLift at 30%: overall - 0.010511, grouped - 0.011366.
Qini AUC Score: 0.00949; UpLift AUC Score: 0.005961
UpLift at 30%: overall - 0.00824, grouped - 0.009345.
Qini AUC Score: 0.004253; UpLift AUC Score: 0.002654
UpLift at 30%: overall - 0.008911, grouped - 0.009985.
Qini AUC Score: 0.011079; UpLift AUC Score: 0.007012
UpLift at 30%: overall - 0.01003, grouped - 0.010685.
Qini AUC Score: 0.010827; UpLift AUC Score: 0.006826
UpLift at 30%: overall - 0.0096, grouped - 0.0104.
Qini AUC Score: 0.0094; UpLift AUC Score: 0.0059
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0096, grouped - 0.0104.
Qini AUC Score: 0.0094; UpLift AUC Score: 0.0059
In [ ]:
gc.collect()
Out[ ]:
37418
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
0

Метод двух моделей¶

In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
X_data, Y_data = proc_data.preprocessing_data(df)
display(X_data.head()) 
display(Y_data.head()) 
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
In [ ]:
#Default data
skf = StratifiedKFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0

for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      catboost_clf0 = CatBoostClassifier(verbose = 0)
      catboost_clf1 = CatBoostClassifier(verbose = 0)
      clf = TwoModels(estimator_ctrl= catboost_clf0, estimator_trmnt=catboost_clf1)
      clf.fit(X_train_, y_train_, treatment = treat[train_index])
      y_pred = clf.predict(X_test_)
      del catboost_clf0
      del catboost_clf1
      del X_train_
      del y_train_
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(y_test_, treat[test_index], y_pred, f"Plots/Two models Split {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics[ 'qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1



uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.008684, grouped - 0.008853.
Qini AUC Score: 0.004217; UpLift AUC Score: 0.002676
UpLift at 30%: overall - 0.009608, grouped - 0.01021.
Qini AUC Score: 0.006914; UpLift AUC Score: 0.004371
UpLift at 30%: overall - 0.008285, grouped - 0.008974.
Qini AUC Score: 0.003883; UpLift AUC Score: 0.002453
UpLift at 30%: overall - 0.008518, grouped - 0.009476.
Qini AUC Score: 0.009343; UpLift AUC Score: 0.005938
UpLift at 30%: overall - 0.007892, grouped - 0.008263.
Qini AUC Score: 0.003991; UpLift AUC Score: 0.002527
UpLift at 30%: overall - 0.0086, grouped - 0.0092.
Qini AUC Score: 0.0057; UpLift AUC Score: 0.0036
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0086, grouped - 0.0092.
Qini AUC Score: 0.0057; UpLift AUC Score: 0.0036
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
37418

Метод трансформации класса (Классификация)¶

In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
X_data, Y_data = proc_data.preprocessing_data(df)
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
p = Y_data[Y_data.treat == 1].shape[0] / Y_data.shape[0]
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
print(p)
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
0.9044035089907759
In [ ]:
#Default data
skf = StratifiedKFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0


for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      catboost_clf = CatBoostClassifier(verbose = 0)
      clf = ClassTransformation(estimator=catboost_clf)
      clf.fit(X_train_, y_train_, treatment = treat[train_index])
      y_pred = clf.predict(X_test_)
      del clf
      del X_train_
      del y_train_
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(y_test_, treat[test_index], y_pred, f"Plots/Trasnform Class model Split {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics[ 'qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1


uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.012387, grouped - 0.01221.
Qini AUC Score: 0.017297; UpLift AUC Score: 0.01098
UpLift at 30%: overall - 0.010416, grouped - 0.010408.
Qini AUC Score: 0.012826; UpLift AUC Score: 0.008124
UpLift at 30%: overall - 0.010632, grouped - 0.010685.
Qini AUC Score: 0.011703; UpLift AUC Score: 0.007423
UpLift at 30%: overall - 0.011065, grouped - 0.011115.
Qini AUC Score: 0.016368; UpLift AUC Score: 0.010422
UpLift at 30%: overall - 0.01146, grouped - 0.011584.
Qini AUC Score: 0.016832; UpLift AUC Score: 0.010673
UpLift at 30%: overall - 0.0112, grouped - 0.0112.
Qini AUC Score: 0.015; UpLift AUC Score: 0.0095
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0112, grouped - 0.0112.
Qini AUC Score: 0.015; UpLift AUC Score: 0.0095
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
37418

Метод трансформации класса (Регрессия)¶

In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
X_data, Y_data = proc_data.preprocessing_data(df)
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
p = Y_data[Y_data.treat == 1].shape[0] / Y_data.shape[0]
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
print(p)
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
0.9044035089907759
In [ ]:
#Default data
skf = StratifiedKFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0


for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      catboost_reg = CatBoostRegressor(verbose = 0)
      clf = ClassTransformationReg(estimator=catboost_reg, propensity_val=p)
      clf.fit(X_train_, y_train_, treatment = treat[train_index])
      y_pred = clf.predict(X_test_)
      del clf
      del X_train_
      del y_train_
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(y_test_, treat[test_index], y_pred, f"Plots/Trasnform Regression model Split {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics[ 'qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1
      gc.collect()


uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.008744, grouped - 0.009386.
Qini AUC Score: 0.001756; UpLift AUC Score: 0.001067
UpLift at 30%: overall - 0.007488, grouped - 0.008201.
Qini AUC Score: 0.000591; UpLift AUC Score: 0.000327
UpLift at 30%: overall - 0.00845, grouped - 0.009176.
Qini AUC Score: 0.004376; UpLift AUC Score: 0.002731
UpLift at 30%: overall - 0.009194, grouped - 0.009799.
Qini AUC Score: 0.008482; UpLift AUC Score: 0.005354
UpLift at 30%: overall - 0.007732, grouped - 0.008318.
Qini AUC Score: 0.002557; UpLift AUC Score: 0.001582
UpLift at 30%: overall - 0.0083, grouped - 0.009.
Qini AUC Score: 0.0036; UpLift AUC Score: 0.0022
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0083, grouped - 0.009.
Qini AUC Score: 0.0036; UpLift AUC Score: 0.0022
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
37343

AutoML Class¶

Search PipeLine¶

In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
proc_data = PrepocessingDataSetLetu()
proc_data.remove_colls(colls_to_drop)
X_data, Y_data = proc_data.preprocessing_data_solomodel(df)
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
print(X.shape)
print(y.shape)
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49 var_50 var_51
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922 1.0 0.0
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757 0.0 1.0
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
(10341321, 52)
(10341321,)
In [ ]:
import evalml
from evalml import AutoMLSearch
In [ ]:
from evalml.objectives import get_optimization_objectives
from evalml.problem_types import ProblemTypes

for objective in get_optimization_objectives(ProblemTypes.BINARY):
    print(objective.name)
MCC Binary
Log Loss Binary
Gini
AUC
Precision
F1
Balanced Accuracy Binary
Accuracy Binary
In [ ]:
X_train,X_test,y_train,y_test = evalml.preprocessing.utils.split_data(X, y, problem_type="binary", test_size = 1e-6)
In [ ]:
X_train.shape, X_test.shape
Out[ ]:
((10341310, 52), (11, 52))
In [ ]:
automl=AutoMLSearch(X_train = X_train
                    , y_train = y_train
                    , n_jobs = -1
                    , problem_type='binary'
                    , patience= 10
                    , verbose = True
                    , tolerance= 1e-12
                    , allow_long_running_models = True
                    , max_batches = 30
                    , objective = 'Gini'
                    , optimize_thresholds=True
                    , train_best_pipeline= True
                    , ensembling=False
                    , max_iterations = 30
                    , holdout_set_size= 0
                    #, allowed_model_families = evalml.pipelines.components.utils.allowed_model_families('binary')
                    )


automl.search()
AutoMLSearch will use mean CV score to rank pipelines.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for Gini. 
Greater score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 30 batches for a total of 30 pipelines. 
Allowed model families: 

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…
Evaluating Baseline Pipeline: Mode Baseline Binary Classification Pipeline
Mode Baseline Binary Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Gini: 0.000

*****************************
* Evaluating Batch Number 1 *
*****************************

Logistic Regression Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Gini: 0.332
Random Forest Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Gini: 0.386

*****************************
* Evaluating Batch Number 2 *
*****************************

Logistic Regression Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler + RF Classifier Select From Model:
	Starting cross validation
	Finished cross validation - mean Gini: 0.265
Random Forest Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + RF Classifier Select From Model:
	Starting cross validation
	Finished cross validation - mean Gini: 0.362

*****************************
* Evaluating Batch Number 3 *
*****************************

Decision Tree Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.347
LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.414
Extra Trees Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.263
Elastic Net Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.264
CatBoost Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.337
XGBoost Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.410


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 4 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.367


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 5 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.387


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 6 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.420


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 7 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.395


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 8 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.392


10 iterations without improvement. Stopping search early...

*****************************
* Evaluating Batch Number 9 *
*****************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.379


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 10 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.398


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 11 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.404


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 12 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.410


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 13 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.365


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 14 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.417


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 15 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.363


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 16 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.414


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 17 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.323


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 18 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.390


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 19 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.417


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 20 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.406


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 21 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.339


10 iterations without improvement. Stopping search early...

******************************
* Evaluating Batch Number 22 *
******************************

LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean Gini: 0.340

Search finished after 1:09:22          
Best pipeline: LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer
Best pipeline Gini: 0.419852
Out[ ]:
{1: {'Logistic Regression Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler': '04:06',
  'Random Forest Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler': '02:46',
  'Total time of batch': '06:56'},
 2: {'Logistic Regression Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler + RF Classifier Select From Model': '03:09',
  'Random Forest Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + RF Classifier Select From Model': '02:48',
  'Total time of batch': '06:01'},
 3: {'Decision Tree Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:59',
  'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:03',
  'Extra Trees Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:20',
  'Elastic Net Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Standard Scaler + Select Columns Transformer': '03:35',
  'CatBoost Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:57',
  'XGBoost Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:39',
  'Total time of batch': '14:46'},
 4: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:57',
  'Total time of batch': '02:06'},
 5: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:03',
  'Total time of batch': '02:03'},
 6: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:14',
  'Total time of batch': '02:15'},
 7: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:04',
  'Total time of batch': '02:04'},
 8: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:10',
  'Total time of batch': '02:10'},
 9: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:59',
  'Total time of batch': '02:00'},
 10: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:19',
  'Total time of batch': '02:19'},
 11: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:15',
  'Total time of batch': '02:15'},
 12: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:01',
  'Total time of batch': '02:02'},
 13: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:01',
  'Total time of batch': '02:01'},
 14: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:04',
  'Total time of batch': '02:04'},
 15: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:14',
  'Total time of batch': '02:14'},
 16: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:26',
  'Total time of batch': '02:26'},
 17: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:07',
  'Total time of batch': '02:07'},
 18: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:18',
  'Total time of batch': '02:18'},
 19: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:07',
  'Total time of batch': '02:07'},
 20: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '02:16',
  'Total time of batch': '02:17'},
 21: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:59',
  'Total time of batch': '01:59'},
 22: {'LightGBM Classifier w/ Label Encoder + Replace Nullable Types Transformer + Imputer + Undersampler + Select Columns Transformer': '01:59',
  'Total time of batch': '01:59'}}
In [ ]:
automl.save('automl_big_searching_class.cloudpickle')
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
147

Train Best Pipeline¶

In [ ]:
import evalml
from evalml import AutoMLSearch
automl = AutoMLSearch.load('automl_big_searching_class.cloudpickle')
In [ ]:
ranks = automl.rankings
ranks = ranks.drop(['id', 'search_order'	,'mean_cv_score'	,'standard_deviation_cv_score', 'high_variance_cv'], axis = 1)
In [ ]:
ranks.to_csv('evalml_class_res.csv', index = False)
In [ ]:
ranks.head(10)
Out[ ]:
pipeline_name ranking_score percent_better_than_baseline parameters
0 LightGBM Classifier w/ Label Encoder + Replace... 0.419852 inf {'Label Encoder': {'positive_label': None}, 'I...
5 XGBoost Classifier w/ Label Encoder + Replace ... 0.410464 inf {'Label Encoder': {'positive_label': None}, 'I...
14 Random Forest Classifier w/ Label Encoder + Re... 0.386029 inf {'Label Encoder': {'positive_label': None}, 'I...
19 Random Forest Classifier w/ Label Encoder + Re... 0.361650 inf {'Label Encoder': {'positive_label': None}, 'I...
20 Decision Tree Classifier w/ Label Encoder + Re... 0.346527 inf {'Label Encoder': {'positive_label': None}, 'I...
23 CatBoost Classifier w/ Label Encoder + Replace... 0.337224 inf {'Label Encoder': {'positive_label': None}, 'I...
24 Logistic Regression Classifier w/ Label Encode... 0.331530 inf {'Label Encoder': {'positive_label': None}, 'I...
26 Logistic Regression Classifier w/ Label Encode... 0.265303 inf {'Label Encoder': {'positive_label': None}, 'I...
27 Elastic Net Classifier w/ Label Encoder + Repl... 0.264182 inf {'Label Encoder': {'positive_label': None}, 'I...
28 Extra Trees Classifier w/ Label Encoder + Repl... 0.263340 inf {'Label Encoder': {'positive_label': None}, 'I...
In [ ]:
ranks.shape
Out[ ]:
(11, 4)
In [ ]:
ranks.head(10).to_excel('class_res.xlsx')
In [ ]:
ranks.iloc[0, :].parameters
Out[ ]:
{'Label Encoder': {'positive_label': None},
 'Imputer': {'categorical_impute_strategy': 'most_frequent',
  'numeric_impute_strategy': 'knn',
  'boolean_impute_strategy': 'knn',
  'categorical_fill_value': None,
  'numeric_fill_value': None,
  'boolean_fill_value': None},
 'Undersampler': {'sampling_ratio': 0.25,
  'min_samples': 100,
  'min_percentage': 0.1,
  'sampling_ratio_dict': None},
 'Select Columns Transformer': {'columns': [16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   31,
   33,
   34,
   35,
   37,
   38,
   41,
   43,
   44,
   45,
   47,
   48,
   49]},
 'LightGBM Classifier': {'boosting_type': 'dart',
  'learning_rate': 0.32830397998590266,
  'n_estimators': 90,
  'max_depth': 0,
  'num_leaves': 89,
  'min_child_samples': 80,
  'n_jobs': -1,
  'bagging_freq': 1,
  'bagging_fraction': 0.9223480594486513}}
In [ ]:
best_clf = automl.best_pipeline
type(best_clf)
Out[ ]:
evalml.pipelines.binary_classification_pipeline.BinaryClassificationPipeline
In [ ]:
best_clf = automl.get_pipeline(2)
type(best_clf)
Out[ ]:
evalml.pipelines.binary_classification_pipeline.BinaryClassificationPipeline
In [ ]:
# df= pd.read_csv(r'uplift_alls_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
proc_data = PrepocessingDataSetLetu()
proc_data.remove_colls(colls_to_drop)
X_data, Y_data = proc_data.preprocessing_data_solomodel(df)
treat = Y_data.treat.values
X = X_data.values
y = Y_data.target.values
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
print(X.shape)
print(y.shape)
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49 var_50 var_51
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922 1.0 0.0
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000 0.0 1.0
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757 0.0 1.0
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
(10341321, 52)
(10341321,)
In [ ]:
import copy
skf = StratifiedKFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0

for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      #best_clf  = copy.deepcopy(automl.best_pipeline)
      best_clf  = copy.deepcopy(automl.best_pipeline)
      best_clf.fit(X_train_, y_train_)
      X_test_[:, -2:] = proc_data.ohe_solomodel.transform(np.zeros(shape = treat[test_index].shape).reshape(-1, 1)).toarray()
      y_pred0 = best_clf.predict_proba(X_test_)[[1]]
      X_test_[:, -2:] = proc_data.ohe_solomodel.transform(np.ones(shape = treat[test_index].shape).reshape(-1, 1)).toarray()
      y_pred1 = best_clf.predict_proba(X_test_)[[1]]
      y_pred = np.squeeze(y_pred1 - y_pred0).tolist()
      del X_train_
      del y_train_
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(y_test_, treat[test_index], y_pred, f"Plots/CLassification EvalML model Procentile Split Overall {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics[ 'qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1
      gc.collect()


uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.00558, grouped - 0.005543.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
UpLift at 30%: overall - 0.006086, grouped - 0.006154.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
UpLift at 30%: overall - 0.005791, grouped - 0.005769.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
UpLift at 30%: overall - 0.004065, grouped - 0.004049.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
UpLift at 30%: overall - 0.004696, grouped - 0.004657.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
UpLift at 30%: overall - 0.0052, grouped - 0.0052.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0052, grouped - 0.0052.
Qini AUC Score: 0.0; UpLift AUC Score: 0.0
In [ ]:
del X_data
del Y_data
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
37398

AutoML Reg¶

Search PipeLine¶

In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte']  + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
X_data, Y_data = proc_data.preprocessing_data(df)
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt

treat = Y_data.treat.values
targ = Y_data.target.values

p = Y_data[Y_data.treat == 1].shape[0] / Y_data.shape[0]
X = X_data.values
#X = (X - X.mean()) / X.std()
y = Y_data.target * (Y_data.treat - p) / (1- p) / p
y = y.values.astype(float)
features = X_data.columns.tolist()
gc.collect()
print(p)
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
0.9044035089907759
In [ ]:
import evalml
from evalml import AutoMLSearch
In [ ]:
# from evalml.objectives import get_optimization_objectives
# from evalml.problem_types import ProblemTypes

# for objective in get_optimization_objectives(ProblemTypes.REGRESSION):
#     print(objective.name)
In [ ]:
X_train,X_test,y_train,y_test= evalml.preprocessing.utils.split_data(X, y, problem_type="regression", test_size = 1e-6)
In [ ]:
automl=AutoMLSearch(X_train = X_train
                    , y_train = y_train
                    , n_jobs = -1
                    , problem_type='regression'
                    , patience= 20
                    , verbose = True
                    , allow_long_running_models = True
                    , max_batches = 20
                    , objective = 'MSE'
                    , optimize_thresholds=True
                    , train_best_pipeline= True
                    , ensembling=False
                    , max_iterations = 20
                    , holdout_set_size= 0
                    #, allowed_model_families = evalml.pipelines.components.utils.allowed_model_families('binary')
                    )


automl.search()
AutoMLSearch will use mean CV score to rank pipelines.

*****************************
* Beginning pipeline search *
*****************************

Optimizing for MSE. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 20 batches for a total of 20 pipelines. 
Allowed model families: 

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…
Evaluating Baseline Pipeline: Mean Baseline Regression Pipeline
Mean Baseline Regression Pipeline:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759

*****************************
* Evaluating Batch Number 1 *
*****************************

Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759

*****************************
* Evaluating Batch Number 2 *
*****************************

Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + RF Regressor Select From Model:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer + RF Regressor Select From Model:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759

*****************************
* Evaluating Batch Number 3 *
*****************************

Decision Tree Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
XGBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
LightGBM Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759

*****************************
* Evaluating Batch Number 4 *
*****************************

CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.761
Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759
CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer:
	Starting cross validation
	Finished cross validation - mean MSE: 0.759

Search finished after 1:41:44          
Best pipeline: CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer
Best pipeline MSE: 0.758529
Out[ ]:
{1: {'Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler': '06:13',
  'Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer': '11:55',
  'Total time of batch': '18:12'},
 2: {'Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + RF Regressor Select From Model': '14:11',
  'Random Forest Regressor w/ Replace Nullable Types Transformer + Imputer + RF Regressor Select From Model': '17:49',
  'Total time of batch': '32:04'},
 3: {'Decision Tree Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '01:28',
  'Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '04:22',
  'XGBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '04:37',
  'CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '01:08',
  'LightGBM Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '01:08',
  'Total time of batch': '12:56'},
 4: {'CatBoost Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '01:26',
  'Elastic Net Regressor w/ Replace Nullable Types Transformer + Imputer + Standard Scaler + Select Columns Transformer': '02:08',
  'Extra Trees Regressor w/ Replace Nullable Types Transformer + Imputer + Select Columns Transformer': '15:35',
  'Total time of batch': '38:15'}}
In [ ]:
automl.save('automl_big_searching.cloudpickle')
In [ ]:
del X_data
del Y_data
del X_train
del y_train
del X_test
del y_test
del X
del treat
#del df
del y
gc.collect()
Out[ ]:
213

Train Best Pipeline¶

In [ ]:
import evalml
from evalml import AutoMLSearch
automl = AutoMLSearch.load('automl_big_searching.cloudpickle')
In [ ]:
ranks = automl.rankings
ranks = ranks.drop(['id', 'search_order'	,'mean_cv_score'	,'standard_deviation_cv_score', 'high_variance_cv'], axis = 1)
In [ ]:
ranks.to_csv('evalml_res.csv', index = False)
In [ ]:
ranks.head()
Out[ ]:
pipeline_name ranking_score percent_better_than_baseline parameters
0 CatBoost Regressor w/ Replace Nullable Types T... 0.758529 0.000604 {'Imputer': {'categorical_impute_strategy': 'm...
1 Mean Baseline Regression Pipeline 0.758534 0.000000 {'Baseline Regressor': {'strategy': 'mean'}}
2 Elastic Net Regressor w/ Replace Nullable Type... 0.758534 0.000000 {'Imputer': {'categorical_impute_strategy': 'm...
5 Elastic Net Regressor w/ Replace Nullable Type... 0.758536 -0.000234 {'Imputer': {'categorical_impute_strategy': 'm...
6 Elastic Net Regressor w/ Replace Nullable Type... 0.758536 -0.000313 {'Imputer': {'categorical_impute_strategy': 'm...
In [ ]:
ranks.shape
Out[ ]:
(11, 4)
In [ ]:
ranks.head(10).to_excel('res.xlsx')
In [ ]:
ranks.iloc[0, :].parameters
Out[ ]:
{'Imputer': {'categorical_impute_strategy': 'most_frequent',
  'numeric_impute_strategy': 'mean',
  'boolean_impute_strategy': 'most_frequent',
  'categorical_fill_value': None,
  'numeric_fill_value': None,
  'boolean_fill_value': None},
 'Select Columns Transformer': {'columns': [16,
   17,
   18,
   20,
   21,
   23,
   24,
   25,
   26,
   27,
   28,
   31,
   33,
   34,
   35,
   37,
   38,
   40,
   41,
   43,
   44,
   45,
   47,
   48,
   49]},
 'CatBoost Regressor': {'n_estimators': 10,
  'eta': 0.03,
  'max_depth': 6,
  'bootstrap_type': None,
  'silent': False,
  'allow_writing_files': False,
  'n_jobs': -1}}
In [ ]:
ctbst  = automl.best_pipeline
type(ctbst)
Out[ ]:
evalml.pipelines.regression_pipeline.RegressionPipeline
In [ ]:
# df= pd.read_csv(r'uplift_dataset.zip'
#                     , sep = ';'
#                     , encoding = 'ANSI'
#                     , dtype = {'NPL':str
#                             , 'SEGMENT_N':str}, index_col=['NPL']).drop(['FEDERAL_DATE', 'Group', 'Send_DAte'] + colls_to_drop, axis = 1)#.set_index('NPL')
# df.index.name = None
X_data, Y_data = proc_data.preprocessing_data(df)
display(X_data.head()) 
display(Y_data.head()) 
gc.collect()
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt

treat = Y_data.treat.values
targ = Y_data.target.values

p = Y_data[Y_data.treat == 1].shape[0] / Y_data.shape[0]
X = X_data.values
#X = (X - X.mean()) / X.std()
y = Y_data.target * (Y_data.treat - p) / (1- p) / p
y = y.values
features = X_data.columns.tolist()
gc.collect()
print(p)
from sklift.metrics import (
    uplift_at_k, uplift_auc_score, qini_auc_score, weighted_average_uplift
)

from sklift.viz import plot_qini_curve, plot_uplift_curve, plot_uplift_by_percentile
import matplotlib.pyplot as plt
var_0 var_1 var_2 var_3 var_4 var_5 var_6 var_7 var_8 var_9 var_10 var_11 var_12 var_13 var_14 var_15 var_16 var_17 var_18 var_19 var_20 var_21 var_22 var_23 var_24 var_25 var_26 var_27 var_28 var_29 var_30 var_31 var_32 var_33 var_34 var_35 var_36 var_37 var_38 var_39 var_40 var_41 var_42 var_43 var_44 var_45 var_46 var_47 var_48 var_49
0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.33 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 1.025470 -0.566303 -0.422688 -0.364185 -0.122348 1.170560 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
1 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.61 0.188076 0.494648 -0.194261 1.211479 0.494648 -0.194261 -0.644546 -0.566303 -0.422688 0.163204 -0.122348 -0.717262 -0.581921 -0.07909 0.485022 -0.07909 0.485022 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.36 0.463681 -0.473833 1.238512 -0.607338 -0.473833 1.238512 -0.250992 1.127489 -0.243014 -0.297063 -0.122348 -0.177884 1.274518 -0.07909 -0.450530 -0.07909 -0.450530 1.788794 -0.219971 0.657498 -0.158142 -0.189061 -0.308908 1.741966 -0.189340 0.918137 -0.209564 0.243203 -0.023623 -1.046476 0.498922 0.0 0.498922
3 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.36 -1.128704 -0.575861 -0.910647 -0.667017 -0.575861 -0.910647 0.507636 -0.566303 -0.422688 -0.364185 -0.122348 0.631182 -0.581921 -0.07909 -0.586965 -0.07909 -0.586965 -0.652403 -0.389186 -0.619390 -0.158142 -0.189061 -0.308908 -0.651816 -0.395896 -0.628891 0.394758 0.243203 0.344056 -1.046476 0.000000 0.0 0.000000
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.57 -0.577494 -0.152779 -0.194261 0.075395 -0.152779 -0.194261 -0.727400 0.531186 -0.328768 -0.143641 -0.122348 -0.717262 0.620956 -0.07909 -0.138680 -0.07909 -0.138680 0.929366 0.390969 0.657498 -0.158142 -0.189061 -0.308908 0.899231 0.556416 0.918137 0.394758 0.243203 0.344056 -1.046476 0.122757 0.0 0.122757
target treat
0 1 1
1 0 1
2 0 0
3 0 1
4 1 1
0.9044035089907759
In [ ]:
import copy
skf = KFold(n_splits=5, random_state= 42, shuffle=True)
skf.get_n_splits(X, y)
metrics = {'uplift_k_group':[],  'uplift_k_overall':[], 'qini_score':[], 'uplift_score':[]}
i = 0

for train_index, test_index in skf.split(X, y):
      gc.collect()
      X_train_, X_test_ = X[train_index], X[test_index]
      y_train_, y_test_ = y[train_index], y[test_index]
      ctbst  = copy.deepcopy(automl.best_pipeline)
      ctbst.fit(X_train_, y_train_)
      y_pred = ctbst.predict(X_test_)
      del X_train_
      del y_train_ 
      del X_test_
      gc.collect()
      uplift_overall, uplift_group, qini_score, uplift_score = get_report(targ[test_index], treat[test_index], y_pred, f"Plots/Trasnform Regression EvalML model Split {i}.png")
      metrics['uplift_k_group'].append(uplift_group)
      metrics['uplift_k_overall'].append(uplift_overall)
      metrics['qini_score'].append(qini_score)
      metrics['uplift_score'].append(uplift_score)
      i += 1
      gc.collect()


uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.007693, grouped - 0.009018.
Qini AUC Score: 0.005869; UpLift AUC Score: 0.003681
UpLift at 30%: overall - 0.010423, grouped - 0.01159.
Qini AUC Score: 0.011856; UpLift AUC Score: 0.007474
UpLift at 30%: overall - 0.012475, grouped - 0.013267.
Qini AUC Score: 0.01462; UpLift AUC Score: 0.009186
UpLift at 30%: overall - 0.010282, grouped - 0.011502.
Qini AUC Score: 0.012019; UpLift AUC Score: 0.007549
UpLift at 30%: overall - 0.010825, grouped - 0.012257.
Qini AUC Score: 0.012519; UpLift AUC Score: 0.007888
UpLift at 30%: overall - 0.0103, grouped - 0.0115.
Qini AUC Score: 0.0114; UpLift AUC Score: 0.0072
In [ ]:
uplift_overall = np.round(np.mean(metrics['uplift_k_overall']), 4)
uplift_group = np.round(np.mean(metrics['uplift_k_group']), 4)
qini_score = np.round(np.mean(metrics['qini_score']), 4)
uplift_score= np.round(np.mean(metrics['uplift_score']), 4)
print(f'UpLift at 30%: overall - {uplift_overall}, grouped - {uplift_group}.\nQini AUC Score: {qini_score}; UpLift AUC Score: {uplift_score}')
UpLift at 30%: overall - 0.0103, grouped - 0.0115.
Qini AUC Score: 0.0114; UpLift AUC Score: 0.0072
In [ ]:
del X_data
del Y_data
del X
del treat
del df
del y
gc.collect()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[70], line 1
----> 1 del X_data
      2 del Y_data
      3 del X

NameError: name 'X_data' is not defined
In [ ]: